Load the data - are these dates errors or are the samples that were found later in the season

library(ggplot2)
library(stringr)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-6. For overview type 'help("mgcv-package")'.
set.seed(123)
data_length <- read.csv("~/mysis/data/lengthData.csv", skip = 4, header = TRUE)
net <- data_length$Size..m.
y <- data_length$Length..mm.
gender <- as.factor(substr(str_trim(as.character(data_length$Gender)), 0, 1))
                       ## remove whitespace from string, then truncate into 
                       ## broad gender classes, not subclasses

date <- as.numeric(data_length$Date)
date[date == 2] <- 1      ## collapse into monthly values
date[date == 3] <- 2
date[date == 4] <- 3
date <- as.factor(date)
plot(as.numeric(date), type='l', main = "Is this a data error? - yes?")

## correct date mislabeling
date <- as.numeric(data_length$Date)
date[7330:7519] <- 4
date[7559:7773] <- 4
date[date == 2] <- 1      ## collapse into monthly values
date[date == 3] <- 2
date[date == 4] <- 3
date <- as.factor(date)
plot(as.numeric(date), type='l', main = "Data looks better")

data <- data.frame(y=y, date=date, net=net, gender=gender)
X <- model.matrix(~ net + date + gender, data=data)
ggplot(data=data, aes(y)) + geom_histogram() + facet_grid(net ~ date)

idx <- attr(uniquecombs(X), "index")

Write the model

\(t\) = time \(i\) = net size \(j\) = age class \(k\) = replicate

\[ y_{tijk} \sim \mbox{N}(\mu_{tij}, \sigma^2_{tij}) \]

## Loading required package: Rcpp
## Loading required package: inline
## 
## Attaching package: 'inline'
## 
## The following object is masked from 'package:Rcpp':
## 
##     registerPlugin
## 
## rstan (Version 2.5.0, packaged: 2014-10-22 14:19:22 UTC, GitRev: e52c66f42e81)
## 
## TRANSLATING MODEL 'lengthHeterogeneous' FROM Stan CODE TO C++ CODE NOW.
## COMPILING THE C++ CODE FOR MODEL 'lengthHeterogeneous' NOW.
## In file included from file1c376ce59b92.cpp:421:
## In file included from /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include/rstan/rstaninc.hpp:3:
## /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include/rstan/stan_fit.hpp:732:15: warning: unused variable 'return_code' [-Wunused-variable]
##           int return_code = stan::common::do_bfgs_optimize(model, lbfgs, base_rng,
##               ^
## /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include/rstan/stan_fit.hpp:796:15: warning: unused variable 'return_code' [-Wunused-variable]
##           int return_code = stan::common::do_bfgs_optimize(model, bfgs, base_rng,
##               ^
## /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include/rstan/stan_fit.hpp:616:14: warning: unused variable 'init_log_prob' [-Wunused-variable]
##       double init_log_prob;
##              ^
## In file included from file1c376ce59b92.cpp:8:
## In file included from /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include//stansrc/stan/model/model_header.hpp:16:
## In file included from /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include//stansrc/stan/agrad/rev.hpp:5:
## /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include//stansrc/stan/agrad/rev/chainable.hpp:87:17: warning: 'static' function 'set_zero_all_adjoints' declared in header file should be declared 'static inline' [-Wunneeded-internal-declaration]
##     static void set_zero_all_adjoints() {
##                 ^
## In file included from file1c376ce59b92.cpp:8:
## In file included from /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include//stansrc/stan/model/model_header.hpp:23:
## /Library/Frameworks/R.framework/Versions/3.1/Resources/library/rstan/include//stansrc/stan/io/dump.hpp:26:14: warning: function 'product' is not needed and will not be emitted [-Wunneeded-internal-declaration]
##       size_t product(std::vector<size_t> dims) {
##              ^
## 5 warnings generated.
## the number of chains is less than 1; sampling not done

Fit the Model

Model output

m
## Inference for Stan model: lengthHeterogeneous.
## 4 chains, each with iter=5000; warmup=2500; thin=1; 
## post-warmup draws per chain=2500, total post-warmup draws=10000.
## 
##             mean se_mean   sd     2.5%      25%      50%      75%    97.5%
## beta[1]    15.38    0.00 0.13    15.12    15.29    15.38    15.46    15.63
## beta[2]     0.10    0.00 0.08    -0.05     0.05     0.10     0.15     0.25
## beta[3]     1.38    0.00 0.04     1.31     1.36     1.38     1.41     1.46
## beta[4]     1.91    0.00 0.05     1.82     1.88     1.92     1.95     2.01
## beta[5]    -8.61    0.00 0.11    -8.82    -8.68    -8.61    -8.53    -8.39
## beta[6]    -1.61    0.00 0.12    -1.84    -1.69    -1.61    -1.53    -1.38
## beta[7]    -4.82    0.00 0.12    -5.05    -4.91    -4.82    -4.74    -4.59
## s[1]        1.96    0.00 0.23     1.57     1.79     1.94     2.10     2.46
## s[2]        2.85    0.00 0.22     2.45     2.69     2.83     2.99     3.32
## s[3]        1.98    0.00 0.11     1.78     1.90     1.98     2.05     2.21
## s[4]        1.33    0.00 0.04     1.26     1.31     1.33     1.36     1.40
## s[5]        2.53    0.00 0.29     2.03     2.32     2.51     2.71     3.18
## s[6]        1.65    0.00 0.11     1.46     1.58     1.65     1.72     1.88
## s[7]        1.94    0.00 0.15     1.67     1.83     1.93     2.04     2.27
## s[8]        0.83    0.00 0.06     0.72     0.79     0.83     0.87     0.96
## s[9]        1.47    0.00 0.18     1.18     1.35     1.46     1.58     1.87
## s[10]       2.02    0.00 0.12     1.79     1.93     2.01     2.10     2.27
## s[11]       2.05    0.00 0.16     1.77     1.94     2.04     2.15     2.39
## s[12]       0.97    0.00 0.05     0.87     0.93     0.97     1.00     1.07
## s[13]       2.53    0.00 0.13     2.28     2.43     2.52     2.61     2.79
## s[14]       3.10    0.00 0.13     2.87     3.02     3.10     3.18     3.36
## s[15]       1.89    0.00 0.07     1.77     1.84     1.89     1.93     2.02
## s[16]       1.31    0.00 0.02     1.28     1.30     1.31     1.32     1.35
## s[17]       2.50    0.00 0.12     2.28     2.42     2.50     2.58     2.74
## s[18]       1.78    0.00 0.05     1.68     1.74     1.78     1.81     1.88
## s[19]       1.79    0.00 0.06     1.67     1.75     1.79     1.83     1.92
## s[20]       0.78    0.00 0.04     0.71     0.75     0.78     0.80     0.86
## s[21]       2.28    0.00 0.15     2.00     2.17     2.27     2.37     2.60
## s[22]       1.96    0.00 0.07     1.83     1.91     1.95     2.00     2.10
## s[23]       1.93    0.00 0.09     1.77     1.87     1.93     1.99     2.12
## s[24]       1.06    0.00 0.03     1.01     1.04     1.06     1.08     1.12
## lp__    -8467.68    0.06 3.94 -8476.34 -8470.17 -8467.34 -8464.88 -8461.00
##         n_eff Rhat
## beta[1]  4635    1
## beta[2] 10000    1
## beta[3] 10000    1
## beta[4]  8347    1
## beta[5]  4344    1
## beta[6]  4523    1
## beta[7]  4799    1
## s[1]    10000    1
## s[2]    10000    1
## s[3]    10000    1
## s[4]    10000    1
## s[5]    10000    1
## s[6]    10000    1
## s[7]    10000    1
## s[8]    10000    1
## s[9]    10000    1
## s[10]   10000    1
## s[11]   10000    1
## s[12]   10000    1
## s[13]   10000    1
## s[14]   10000    1
## s[15]   10000    1
## s[16]   10000    1
## s[17]   10000    1
## s[18]   10000    1
## s[19]   10000    1
## s[20]   10000    1
## s[21]   10000    1
## s[22]   10000    1
## s[23]   10000    1
## s[24]   10000    1
## lp__     4508    1
## 
## Samples were drawn using NUTS(diag_e) at Fri Sep  4 12:35:27 2015.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

Plot traceplots

rstan::traceplot(m, inc_warmup = FALSE)

Plot fitted vs. simulated values

e <- rstan::extract(m, pars = c("beta", "s"))
par(mfrow = c(3, 3))
for(i in 1:2) {
  if(i == 1) {
    for(j in 1:dim(e[[i]])[2]) {
      plot(density(e[[i]][,j]), main = paste("beta", j, sep=""))
    }
  } else if(i == 2) {
    plot(density(e[[i]]), main = "s")
  }
}

library(ggmcmc)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:nlme':
## 
##     collapse
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Loading required package: tidyr
## 
## Attaching package: 'tidyr'
## 
## The following object is masked from 'package:rstan':
## 
##     extract
library(coda)
## 
## Attaching package: 'coda'
## 
## The following object is masked from 'package:rstan':
## 
##     traceplot
s <-  mcmc.list(lapply(1:ncol(m), function(x) mcmc(as.array(m)[,x,])))
S <- ggs(s)


ggs_caterpillar(S, family = "beta") 

library(Gmisc)
## Loading required package: htmlTable
tableData <- cbind(summary(s)$statistics[1:(dim(e[[1]])[2] + 1), 1],
                   summary(s)$quantiles[1:(dim(e[[1]])[2] + 1), c(1, 5)])
htmlTable(x=round(tableData, digits=2),
          caption=paste("Regression parameter estimates and CI's"),
          header = c("Estimate", "Lower 95% CI", "Upper 95% CI"),
          rowlabel = "Variables",
          rnames = c("Intercept",
                    "Net Size",
                    "August",
                    "September",
                    "Juvenile",
                    "Male",
                    "Unknown",
                    "standard deviation"))
Regression parameter estimates and CI’s
Variables Estimate Lower 95% CI Upper 95% CI
Intercept 15.38 15.12 15.63
Net Size 0.1 -0.05 0.25
August 1.38 1.31 1.46
September 1.91 1.82 2.01
Juvenile -8.61 -8.82 -8.39
Male -1.61 -1.84 -1.38
Unknown -4.82 -5.05 -4.59
standard deviation 1.96 1.57 2.46
summary(lm(y ~ net + date + gender))
## 
## Call:
## lm(formula = y ~ net + date + gender)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1201  -1.1032  -0.0632   1.0397  10.9368 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 15.73781    0.10216  154.049  < 2e-16 ***
## net          0.22246    0.08634    2.577  0.00999 ** 
## date2        0.97987    0.04592   21.336  < 2e-16 ***
## date3        0.90260    0.04660   19.368  < 2e-16 ***
## genderJ     -8.97707    0.06594 -136.130  < 2e-16 ***
## genderM     -1.63806    0.07241  -22.621  < 2e-16 ***
## genderU     -4.38327    0.07085  -61.870  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.647 on 9120 degrees of freedom
## Multiple R-squared:  0.8267, Adjusted R-squared:  0.8266 
## F-statistic:  7252 on 6 and 9120 DF,  p-value: < 2.2e-16
plot(lm(y ~ net + date + gender))

library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
summary(rlm(y ~ net + date + gender))
## 
## Call: rlm(formula = y ~ net + date + gender)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2080  -1.0124  -0.0275   1.0325  11.0125 
## 
## Coefficients:
##             Value     Std. Error t value  
## (Intercept)   15.7354    0.0950   165.5745
## net            0.1551    0.0803     1.9309
## date2          1.1375    0.0427    26.6261
## date3          1.1334    0.0434    26.1438
## genderJ       -8.9830    0.0613  -146.4337
## genderM       -1.5479    0.0674   -22.9787
## genderU       -4.7710    0.0659   -72.3921
## 
## Residual standard error: 1.516 on 9120 degrees of freedom
plot(rlm(y ~ net + date + gender))